# Dataset source: Inside Airbnb open data, http://insideairbnb.com/get-the-data.html

#############################################
         # Library Requirement #
#############################################
library(tidytext) # Package tidytext for conversion of text to and from tidy formats
library(dplyr) # Package dplyr is for data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse) # Collection of R packages for data science that are designed to work harmoniously together
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.3
## ✓ tibble  3.0.4     ✓ stringr 1.4.0
## ✓ tidyr   1.1.0     ✓ forcats 0.5.1
## ✓ readr   1.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr) #  Package readr is to provide a fast and friendly way to read rectangular data (like csv, tsv, and fwf).
# install.packages("visdat") 
library(visdat) # Package for visualizing missing data
library(ggplot2) # Package for plotting
library(DT) # Package for HTML display of data
library(corrplot) # Package for correlation analysis, confidence interval
## corrplot 0.84 loaded
# install.packages("hrbrthemes")
library(hrbrthemes)# A compilation of extra 'ggplot2' themes, scales and utilities, including a spell check function for plot label fields and an overall emphasis on typography
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(cowplot) # The "cowplot" package is a simple add-on to ggplot2. It provides various features that help with creating publication-quality figures, such as a set of themes
# install.packages("webmap")
library(ggmap) # Package ggmap is a collection of functions to visualize spatial data and models on top of static maps from various online sources (e.g Google Maps)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:cowplot':
## 
##     theme_nothing
#############################################
          # Data Preparation #
#############################################
# Load the Shanghai listings export into a tibble.
# The path is assembled with file.path() from plain directory names (spaces
# written literally) instead of one string using backslash-space escapes,
# which are not among the escapes documented for R string constants.
# NOTE(review): this is still an absolute path on the author's machine;
# point it at your own copy of listings.csv before running.
Airbnb_Shanghai_2021 <- read_csv(
  file.path(
    "/Users/wqr/Desktop",
    "MSDS 597 Final Project",
    "Shanghai Dataset",
    "listings.csv"
  )
)
## 
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────
## cols(
##   id = col_double(),
##   name = col_character(),
##   host_id = col_double(),
##   host_name = col_character(),
##   neighbourhood_group = col_logical(),
##   neighbourhood = col_character(),
##   latitude = col_double(),
##   longitude = col_double(),
##   room_type = col_character(),
##   price = col_double(),
##   minimum_nights = col_double(),
##   number_of_reviews = col_double(),
##   last_review = col_date(format = ""),
##   reviews_per_month = col_double(),
##   calculated_host_listings_count = col_double(),
##   availability_365 = col_double()
## )
# Preview the first 100 listings to confirm the import looks sensible
head(Airbnb_Shanghai_2021, n = 100)
## # A tibble: 100 x 16
##        id name  host_id host_name neighbourhood_g… neighbourhood latitude
##     <dbl> <chr>   <dbl> <chr>     <lgl>            <chr>            <dbl>
##  1  24963 Hear…   98203 Jia       NA               徐汇区 / Xuhui …     31.2
##  2  24991 Fren…   98203 Jia       NA               徐汇区 / Xuhui …     31.2
##  3 139828 【sid…  681552 Leon      NA               普陀区 / Putuo …     31.2
##  4 161932 Subl…  774393 Michael   NA               静安区 / Jing'a…     31.2
##  5 185736 Apt …  891951 Maggie    NA               徐汇区 / Xuhui …     31.2
##  6 350728 'Lao… 1777552 Nitin     NA               长宁区 / Changn…     31.2
##  7 427038 In t… 2122588 Mia       NA               黄浦区 / Huangp…     31.2
##  8 479517 有简约 …  681552 Leon      NA               静安区 / Jing'a…     31.2
##  9 479530 【sid…  681552 Leon      NA               静安区 / Jing'a…     31.2
## 10 496972 Free… 2454164 Alvin     NA               杨浦区 / Yangpu…     31.3
## # … with 90 more rows, and 9 more variables: longitude <dbl>, room_type <chr>,
## #   price <dbl>, minimum_nights <dbl>, number_of_reviews <dbl>,
## #   last_review <date>, reviews_per_month <dbl>,
## #   calculated_host_listings_count <dbl>, availability_365 <dbl>
# Number of rows and columns in the listings table
dim(Airbnb_Shanghai_2021)
## [1] 36294    16
# Plot where values are missing, column by column
vis_miss(Airbnb_Shanghai_2021)

# Browse the table as an interactive HTML widget with the "colvis" button
datatable(
  Airbnb_Shanghai_2021,
  extensions = "Buttons",
  options = list(dom = "Bfrtip", buttons = I("colvis"))
)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
#############################################
     # Correlation Metrics Analysis #
#############################################
# Keep only the numeric columns. vapply() guarantees exactly one logical per
# column, whereas sapply() can change its return type with the input.
Airbnb_Shanghai_2021_cor <- Airbnb_Shanghai_2021[
  , vapply(Airbnb_Shanghai_2021, is.numeric, logical(1))
]
# cor() is called with its default NA handling, so rows containing any
# missing value are dropped first.
Airbnb_Shanghai_2021_cor <- Airbnb_Shanghai_2021_cor[
  complete.cases(Airbnb_Shanghai_2021_cor),
]
# Spearman rank correlations between every pair of numeric columns
correlation_matrix <- cor(Airbnb_Shanghai_2021_cor, method = "spearman")
corrplot(correlation_matrix, method = "color")

#############################################
# Exploratory Data Analysis #
#############################################
### Distribution of Shanghai Airbnb Price
# Shared plot styling: no grid lines or panel background, black axis lines,
# white legend keys and 15-pt text.
background_canvas <- theme(
  text = element_text(size = 15),
  legend.key = element_rect(fill = "white"),
  axis.line.x = element_line(colour = "black"),
  axis.line.y = element_line(colour = "black"),
  panel.background = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank()
)

# Histogram of raw listing prices, 15 price units per bin.
# No par(mfrow = ...) call: it configures base-graphics devices only and has
# no effect on ggplot2 output.
ggplot(Airbnb_Shanghai_2021, aes(price)) +
  geom_histogram(fill = "orange", alpha = 0.85, binwidth = 15) +
  # The complete theme goes first: a complete theme added after
  # background_canvas would replace every setting the canvas makes.
  theme_minimal(base_size = 13) +
  background_canvas +
  xlab("Price") +
  ylab("Frequency") +
  ggtitle("The Distribution of Price in Shanghai 2021")

 ### Transformed distribution of Shanghai Airbnb Price with log10 transformation of x-axis
# Price density histogram on a log10 x-axis, with a dotted vertical line and a
# text label at the overall mean price.
ggplot(Airbnb_Shanghai_2021, aes(price)) +
  background_canvas +
  geom_histogram(bins = 30, aes(y = ..density..), color = "black", fill = "orange") +
  geom_density(alpha = 0.2, color = "red") +
  ggtitle("Transformed distribution of price (Display in RMB, 1$≈6.5RMB )",
          subtitle = expression("With" ~ 'log'[10] ~ "transformation of x-axis")) +
  geom_vline(xintercept = round(mean(Airbnb_Shanghai_2021$price), 2), size = 1, linetype = 3) +
  # The log10 scale is added exactly once; a second scale_x_log10() only
  # replaces the first and makes ggplot2 print a "Scale for 'x' is already
  # present" message.
  scale_x_log10() +
  annotate("text", x = 1800, y = 0.75,
           label = paste("Mean price = ", paste0(round(mean(Airbnb_Shanghai_2021$price), 2), "RMB")),
           color = "#32CD32", size = 6)
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing non-finite values (stat_density).

#############################################
      #  neighborhood mean price #
#############################################
# Mean listing price per neighbourhood, rounded to two decimals
airbnb_neighbourhood <- summarise(
  group_by(Airbnb_Shanghai_2021, neighbourhood),
  price = round(mean(price), 2)
)


# One log10-scaled price density panel per neighbourhood; each panel marks
# that neighbourhood's mean price with a dotted line and a text label.
ggplot(Airbnb_Shanghai_2021, aes(x = price)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "orange") +
  geom_density(fill = "red", alpha = 0.2) +
  geom_vline(
    data = airbnb_neighbourhood,
    mapping = aes(xintercept = price),
    linetype = 3,
    size = 1
  ) +
  geom_text(
    data = airbnb_neighbourhood,
    mapping = aes(x = price + 1400, label = paste("Mean  = ", price)),
    y = 1.5,
    size = 3,
    color = "#32CD32"
  ) +
  facet_wrap(~neighbourhood) +
  scale_x_log10() +
  background_canvas +
  ggtitle(
    "Transformed distribution of price\n by neighbourhood groups",
    subtitle = expression("With" ~ 'log'[10] ~ "transformation of x-axis")
  )
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing non-finite values (stat_density).

#############################################
# Average Price by Different/Room Type #
#############################################
# Mean price for each room type, returned as a two-column data frame
# (room_type, average_price)
mean_room_type <- aggregate(
  x = list(average_price = Airbnb_Shanghai_2021[["price"]]),
  by = list(room_type = Airbnb_Shanghai_2021[["room_type"]]),
  FUN = mean
)
mean_room_type
##         room_type average_price
## 1 Entire home/apt      923.8280
## 2      Hotel room        0.0000
## 3    Private room      523.0714
## 4     Shared room      399.3215
### Average Price by Room Type
# Horizontal lollipop chart: one segment from 0 to the mean price per room
# type, capped with a point and labelled with the mean.
ggplot(data = mean_room_type, aes(x = room_type, y = average_price)) +
  coord_flip() +
  geom_segment(aes(xend = room_type, yend = 0, color = room_type), size = 2) +
  geom_point(size = 6, mapping = aes(color = room_type)) +
  # Labels are rounded to two decimals, matching the other charts in this
  # script; unrounded means print with many decimal digits.
  geom_text(aes(label = round(average_price, 2)), vjust = -1.5) +
  background_canvas +
  labs(title = "Average price by Room type in Shanghai 2021",
       x = "Room Type", y = "Average Price")

#############################################
      #  Neighborhood Analysis #
#############################################
# Count the listings in each neighbourhood and express each count as a
# percentage of all listings, sorted from least to most frequent. The list is
# long, so the chart further down shows only the high-frequency end.
freq_area <- local({
  listing_counts <- table(Airbnb_Shanghai_2021$neighbourhood)
  shares <- data.frame(cbind(
    Frequency = listing_counts,
    Percent = prop.table(listing_counts) * 100
  ))
  shares[order(shares$Frequency), ]
})
freq_area
##                             Frequency    Percent
## 金山区 / Jinshan District         179  0.4931945
## 奉贤区 / Fengxian District        277  0.7632116
## 宝山区 / Baoshan District         641  1.7661321
## 嘉定区 / Jiading District         808  2.2262633
## 普陀区 / Putuo District           828  2.2813688
## 杨浦区 / Yangpu District          888  2.4466854
## 虹口区 / Hongkou District         946  2.6064914
## 青浦区 / Qingpu District         1143  3.1492809
## 松江区 / Songjiang District      1232  3.3945005
## 崇明区 / Chongming District      1308  3.6039015
## 长宁区 / Changning District      1358  3.7416653
## 闵行区 / Minhang District        2317  6.3839753
## 静安区 / Jing'an District        2321  6.3949964
## 徐汇区 / Xuhui District          3671 10.1146195
## 黄浦区 / Huangpu District        4500 12.3987436
## 浦东新区 / Pudong               13877 38.2349700
# Text styling for the frequency chart: centred 18-pt title, small bold and
# slightly rotated axis tick labels, small axis titles, bold legend text.
tema <- theme(
  legend.text = element_text(face = "bold", size = 14),
  plot.title = element_text(hjust = .5, size = 18),
  axis.title.x = element_text(size = 7),
  axis.title.y = element_text(size = 7),
  axis.text.x = element_text(face = "bold", angle = 45, size = 8),
  axis.text.y = element_text(face = "bold", angle = 10, size = 8)
)

# The last 10 rows of freq_area (the most frequent neighbourhoods), with the
# neighbourhood moved out of the row names into a regular column
freq_area_df <- local({
  busiest <- tail(freq_area, 10)
  data.frame(neighbourhood = row.names(busiest), Frequency = busiest$Frequency)
})
freq_area_df
##                  neighbourhood Frequency
## 1    虹口区 / Hongkou District       946
## 2     青浦区 / Qingpu District      1143
## 3  松江区 / Songjiang District      1232
## 4  崇明区 / Chongming District      1308
## 5  长宁区 / Changning District      1358
## 6    闵行区 / Minhang District      2317
## 7    静安区 / Jing'an District      2321
## 8      徐汇区 / Xuhui District      3671
## 9    黄浦区 / Huangpu District      4500
## 10           浦东新区 / Pudong     13877
# Sets the repr.plot.* size options.
# NOTE(review): presumably these are read by a Jupyter/IRkernel front end; confirm.
options(repr.plot.width = 20, repr.plot.height = 10)
# Listing count per neighbourhood. Layers are added as points, then a line,
# then bars, so the bars are drawn last.
ggplot(freq_area_df, aes(x = neighbourhood, y = Frequency)) +
  geom_point(color = "darkblue", size = 4) +
  geom_line(group = 2, linetype = 17, size = 1, alpha = .5, color = "black") +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    size = .7,
    alpha = .8
  ) +
  theme_minimal() +
  tema +
  xlab("") +
  ggtitle("TOP 10 most frequent neighbourhood in Shanghai City")

### Take a look at the top 10 neighborhoods
#### Top 10 highest average prices
# Mean price per neighbourhood, sorted ascending so the most expensive
# neighbourhoods end up in the last rows.
top_10_neighbourhood <- aggregate(list(Airbnb_Shanghai_2021$price), list(Airbnb_Shanghai_2021$neighbourhood), mean)
colnames(top_10_neighbourhood) <- c("neighbourhood", "Average_price_per_neighborhood")
top_10_neighbourhood <- top_10_neighbourhood[order(top_10_neighbourhood$Average_price_per_neighborhood), ]
# Keep the 10 highest averages. Taking tail(12) and then head(10) dropped the
# two most expensive neighbourhoods from a table that is charted as the
# "TOP 10 most expensive".
top_10_neighbourhood <- tail(top_10_neighbourhood, 10)
# Row names are ranks: 1 is the most expensive neighbourhood (the last row).
row.names(top_10_neighbourhood) <- rev(seq_len(nrow(top_10_neighbourhood)))
top_10_neighbourhood
# Text styling for the two "most expensive" charts: centred 15-pt title, small
# bold tick labels, small axis titles and no legend.
tema <- theme(
  legend.position = "none",
  plot.title = element_text(hjust = .5, size = 15),
  axis.title.x = element_text(size = 7),
  axis.title.y = element_text(size = 7),
  axis.text.x = element_text(face = "bold", size = 6),
  axis.text.y = element_text(face = "bold", size = 6)
)

# The second chart uses exactly the same settings
tema1 <- tema

options(repr.plot.width = 20, repr.plot.height = 11)

# Horizontal bar chart of the mean price per neighbourhood, with each bar
# labelled by its value rounded to two decimals.
most_expensive_plot_a <- ggplot(
  top_10_neighbourhood,
  aes(x = neighbourhood, y = Average_price_per_neighborhood)
) +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    size = .7,
    alpha = .8
  ) +
  geom_label(
    aes(label = round(Average_price_per_neighborhood, 2)),
    fontface = "bold",
    fill = "#F5FFFA",
    size = 3
  ) +
  coord_flip() +
  theme_ipsum() +
  tema +
  labs(title = "TOP 10 most expensive neighborhoods in Shanghai City", x = "", y = "")

# The same bars without labels; this plot is switched to polar coordinates
# when the grid is assembled below.
most_expensive_plot_b <- ggplot(
  top_10_neighbourhood,
  aes(x = neighbourhood, y = Average_price_per_neighborhood)
) +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    size = .7,
    alpha = .8
  ) +
  theme_ipsum() +
  tema1 +
  labs(title = "TOP 10 most expensive neighborhoods in Shanghai City", x = "", y = "")

# Place the two charts side by side
plot_grid(
  most_expensive_plot_a,
  most_expensive_plot_b + coord_polar(),
  nrow = 1,
  ncol = 2
)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

### Mean price per neighbourhood, sorted from cheapest to most expensive
top_10_lowest_neighbourhood <- setNames(
  aggregate(
    list(Airbnb_Shanghai_2021$price),
    list(Airbnb_Shanghai_2021$neighbourhood),
    mean
  ),
  c("neighbourhood", "Average_price_per_neighborhood")
)
top_10_lowest_neighbourhood <- top_10_lowest_neighbourhood[
  order(top_10_lowest_neighbourhood$Average_price_per_neighborhood),
]
top_10_lowest_neighbourhood
##                  neighbourhood Average_price_per_neighborhood
## 7     杨浦区 / Yangpu District                       382.0541
## 3    宝山区 / Baoshan District                       418.7676
## 10   虹口区 / Hongkou District                       485.1142
## 1    嘉定区 / Jiading District                       513.5099
## 13   闵行区 / Minhang District                       604.6832
## 15   静安区 / Jing'an District                       643.1443
## 5      徐汇区 / Xuhui District                       648.9120
## 12 长宁区 / Changning District                       686.5140
## 11   金山区 / Jinshan District                       694.3520
## 6      普陀区 / Putuo District                       706.1377
## 16   黄浦区 / Huangpu District                       714.2647
## 9            浦东新区 / Pudong                       765.8575
## 2   奉贤区 / Fengxian District                       822.6498
## 8  松江区 / Songjiang District                      1019.5917
## 14    青浦区 / Qingpu District                      1159.4917
## 4  崇明区 / Chongming District                      1556.2554
# The table is sorted by ascending mean price, so the 10 cheapest
# neighbourhoods are the FIRST 10 rows. tail() would select the 10 most
# expensive instead.
top_10_lowest_neighbourhood <- head(top_10_lowest_neighbourhood, 10)
# Row names are ranks: 1 is the cheapest neighbourhood.
row.names(top_10_lowest_neighbourhood) <- seq_len(nrow(top_10_lowest_neighbourhood))
top_10_lowest_neighbourhood
# Text styling for this chart: centred 15-pt title, small bold tick labels
# tilted slightly, small axis titles and no legend.
tema <- theme(
  legend.position = "none",
  plot.title = element_text(hjust = .5, size = 15),
  axis.title.x = element_text(size = 7),
  axis.title.y = element_text(size = 7),
  axis.text.x = element_text(face = "bold", angle = 15, size = 6),
  axis.text.y = element_text(face = "bold", angle = 10, size = 6)
)

options(repr.plot.width = 20, repr.plot.height = 10)
# Bar chart of the mean price per neighbourhood in
# top_10_lowest_neighbourhood, each bar labelled with its rounded value
lowest_neighborhoods_plot_a <- ggplot(
  top_10_lowest_neighbourhood,
  aes(x = neighbourhood, y = Average_price_per_neighborhood)
) +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    size = .7,
    alpha = .8
  ) +
  geom_label(
    aes(label = round(Average_price_per_neighborhood, 2)),
    fontface = "bold",
    fill = "#F5FFFA",
    size = 3
  ) +
  theme_ipsum() +
  tema +
  labs(title = "TOP 10 cheapest neighborhoods in Shanghai City ", x = "", y = "")
lowest_neighborhoods_plot_a

#############################################
# The Relationship between Price and Reviews#
#############################################
# Scatter plot of price against review count. Points are nearly transparent
# (alpha 0.05) and sized by price.
ggplot(Airbnb_Shanghai_2021, aes(x = number_of_reviews, y = price)) +
  theme(axis.title = element_text(), axis.title.x = element_text()) +
  geom_point(aes(size = price), color = "red", alpha = 0.05) +
  background_canvas +
  labs(
    x = "Number of reviews",
    y = "Price",
    title = "Relationship between prices number of reviews",
    subtitle = "The most expensive houses have small number of reviews"
  )

## The most expensive houses have small number of reviews
#############################################
   # Map for Airbnb House Distribution #
#############################################
# Build the bounding box for the base map: the extent of all listing
# coordinates, padded by 5% of the height/width on every side.
height <- diff(range(Airbnb_Shanghai_2021$latitude))
width <- diff(range(Airbnb_Shanghai_2021$longitude))
Canvas_borders <- with(
  Airbnb_Shanghai_2021,
  c(
    bottom = min(latitude) - 0.05 * height,
    top = max(latitude) + 0.05 * height,
    left = min(longitude) - 0.05 * width,
    right = max(longitude) + 0.05 * width
  )
)
# Available map types: "terrain", "terrain-background", "terrain-labels",
# "terrain-lines", "toner", "toner-2010", "toner-2011", "toner-background",
# "toner-hybrid", "toner-labels", "toner-lines", "toner-lite", "watercolor".
# The result is stored in `map` and passed to ggmap() further down.
# NOTE(review): the tiles are downloaded over the network; confirm the Stamen
# tile host is still served for the installed ggmap version.
map <- get_stamenmap(bbox = Canvas_borders, zoom = 10, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/10/855/416.png
## Source : http://tile.stamen.com/toner-lite/10/856/416.png
## Source : http://tile.stamen.com/toner-lite/10/857/416.png
## Source : http://tile.stamen.com/toner-lite/10/858/416.png
## Source : http://tile.stamen.com/toner-lite/10/859/416.png
## Source : http://tile.stamen.com/toner-lite/10/855/417.png
## Source : http://tile.stamen.com/toner-lite/10/856/417.png
## Source : http://tile.stamen.com/toner-lite/10/857/417.png
## Source : http://tile.stamen.com/toner-lite/10/858/417.png
## Source : http://tile.stamen.com/toner-lite/10/859/417.png
## Source : http://tile.stamen.com/toner-lite/10/855/418.png
## Source : http://tile.stamen.com/toner-lite/10/856/418.png
## Source : http://tile.stamen.com/toner-lite/10/857/418.png
## Source : http://tile.stamen.com/toner-lite/10/858/418.png
## Source : http://tile.stamen.com/toner-lite/10/859/418.png
## Source : http://tile.stamen.com/toner-lite/10/855/419.png
## Source : http://tile.stamen.com/toner-lite/10/856/419.png
## Source : http://tile.stamen.com/toner-lite/10/857/419.png
## Source : http://tile.stamen.com/toner-lite/10/858/419.png
## Source : http://tile.stamen.com/toner-lite/10/859/419.png
## Source : http://tile.stamen.com/toner-lite/10/855/420.png
## Source : http://tile.stamen.com/toner-lite/10/856/420.png
## Source : http://tile.stamen.com/toner-lite/10/857/420.png
## Source : http://tile.stamen.com/toner-lite/10/858/420.png
## Source : http://tile.stamen.com/toner-lite/10/859/420.png
# Draw every listing on the base map, coloured by the natural log of its price
# on the "RdYlGn" palette
ggmap(map) +
  geom_point(
    aes(x = longitude, y = latitude, colour = log(price)),
    data = Airbnb_Shanghai_2021
  ) +
  scale_color_distiller(palette = "RdYlGn", direction = 1)

#############################################
        # Price Prediction #
#############################################
# Remove the neighbourhood_group column before modelling
Airbnb_Shanghai_2021 <- Airbnb_Shanghai_2021 %>%
  select(-neighbourhood_group)
Airbnb_Shanghai_2021
## # A tibble: 36,294 x 15
##        id name  host_id host_name neighbourhood latitude longitude room_type
##     <dbl> <chr>   <dbl> <chr>     <chr>            <dbl>     <dbl> <chr>    
##  1  24963 Hear…   98203 Jia       徐汇区 / Xuhui …     31.2      121. Entire h…
##  2  24991 Fren…   98203 Jia       徐汇区 / Xuhui …     31.2      121. Entire h…
##  3 139828 【sid…  681552 Leon      普陀区 / Putuo …     31.2      121. Entire h…
##  4 161932 Subl…  774393 Michael   静安区 / Jing'a…     31.2      121. Entire h…
##  5 185736 Apt …  891951 Maggie    徐汇区 / Xuhui …     31.2      121. Private …
##  6 350728 'Lao… 1777552 Nitin     长宁区 / Changn…     31.2      121. Private …
##  7 427038 In t… 2122588 Mia       黄浦区 / Huangp…     31.2      121. Private …
##  8 479517 有简约 …  681552 Leon      静安区 / Jing'a…     31.2      121. Entire h…
##  9 479530 【sid…  681552 Leon      静安区 / Jing'a…     31.2      121. Entire h…
## 10 496972 Free… 2454164 Alvin     杨浦区 / Yangpu…     31.3      121. Private …
## # … with 36,284 more rows, and 7 more variables: price <dbl>,
## #   minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## #   availability_365 <dbl>
# 70/30 train/test split.
# `id` is overwritten with the row position so that it can serve as the key
# for the anti_join below.
Airbnb_Shanghai_2021 <- Airbnb_Shanghai_2021 %>% mutate(id = row_number())
# Fix the RNG state so the split, and everything fitted on it, is the same on
# every run; the seed value itself is arbitrary.
set.seed(597)
# Sample 70% of all rows for training, then drop listings whose price is not
# positive (the model below takes log(price)).
airbnb_train <- Airbnb_Shanghai_2021 %>% sample_frac(.7) %>% filter(price > 0)
# The test set is every row that was not sampled, with the same price filter.
airbnb_test  <- anti_join(Airbnb_Shanghai_2021, airbnb_train, by = 'id') %>% filter(price > 0)
head(airbnb_train)
## # A tibble: 6 x 15
##      id name  host_id host_name neighbourhood latitude longitude room_type price
##   <int> <chr>   <dbl> <chr>     <chr>            <dbl>     <dbl> <chr>     <dbl>
## 1 33147 (奕可3…  2.83e8 蔡@君     浦东新区 / Pudon…     31.1      122. Entire h…   424
## 2  6263 [可长租…  7.63e7 小麦      浦东新区 / Pudon…     31.1      122. Entire h…   328
## 3 35248 【特惠八…  3.04e8 Mizao Di… 浦东新区 / Pudon…     31.1      122. Entire h…  3826
## 4 27799 Skyv…  3.85e6 Hope      黄浦区 / Huangp…     31.2      121. Private …   316
## 5 25029 【五星推…  1.62e8 Ruiyin    崇明区 / Chongm…     31.7      121. Private …   498
## 6  6663 上海杨浦…  1.04e8 闲主      杨浦区 / Yangpu…     31.3      122. Entire h…   279
## # … with 6 more variables: minimum_nights <dbl>, number_of_reviews <dbl>,
## #   last_review <date>, reviews_per_month <dbl>,
## #   calculated_host_listings_count <dbl>, availability_365 <dbl>
# Sanity check: the train and test sets together must account for every
# listing with a positive price
nrow(filter(Airbnb_Shanghai_2021, price > 0)) == nrow(airbnb_train) + nrow(airbnb_test)
## [1] TRUE
# Model 1: linear regression of log(price) on the numeric listing attributes
# plus room type and neighbourhood, fitted on the full training sample.
model1 <- lm(
  log(price) ~ minimum_nights + number_of_reviews + reviews_per_month +
    availability_365 + latitude + longitude +
    calculated_host_listings_count + room_type + neighbourhood,
  data = airbnb_train
)
summary(model1)
## 
## Call:
## lm(formula = log(price) ~ minimum_nights + number_of_reviews + 
##     reviews_per_month + availability_365 + latitude + longitude + 
##     calculated_host_listings_count + room_type + neighbourhood, 
##     data = airbnb_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3906 -0.4454 -0.1145  0.3010  6.3838 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               3.915e+01  1.757e+01   2.228  0.02587
## minimum_nights                           -1.533e-04  2.389e-04  -0.642  0.52094
## number_of_reviews                         6.889e-05  2.659e-04   0.259  0.79561
## reviews_per_month                        -6.459e-02  7.428e-03  -8.695  < 2e-16
## availability_365                         -1.000e-04  4.637e-05  -2.157  0.03103
## latitude                                 -4.289e-01  1.625e-01  -2.640  0.00831
## longitude                                -1.638e-01  1.161e-01  -1.411  0.15824
## calculated_host_listings_count           -4.034e-04  2.026e-04  -1.991  0.04648
## room_typePrivate room                    -5.965e-01  1.379e-02 -43.267  < 2e-16
## room_typeShared room                     -1.346e+00  3.903e-02 -34.485  < 2e-16
## neighbourhood奉贤区 / Fengxian District   1.285e-01  9.920e-02   1.295  0.19540
## neighbourhood宝山区 / Baoshan District    4.853e-02  6.927e-02   0.701  0.48355
## neighbourhood崇明区 / Chongming District  1.464e+00  9.549e-02  15.335  < 2e-16
## neighbourhood徐汇区 / Xuhui District      3.138e-01  5.274e-02   5.950 2.75e-09
## neighbourhood普陀区 / Putuo District      2.670e-01  6.457e-02   4.135 3.57e-05
## neighbourhood杨浦区 / Yangpu District     1.913e-01  6.673e-02   2.866  0.00416
## neighbourhood松江区 / Songjiang District  2.185e-01  6.948e-02   3.144  0.00167
## neighbourhood浦东新区 / Pudong            5.248e-01  5.860e-02   8.955  < 2e-16
## neighbourhood虹口区 / Hongkou District    2.573e-01  6.203e-02   4.147 3.39e-05
## neighbourhood金山区 / Jinshan District    6.232e-01  1.336e-01   4.663 3.14e-06
## neighbourhood长宁区 / Changning District  2.354e-01  5.730e-02   4.109 4.00e-05
## neighbourhood闵行区 / Minhang District    1.233e-01  5.892e-02   2.093  0.03639
## neighbourhood青浦区 / Qingpu District     6.434e-01  6.623e-02   9.715  < 2e-16
## neighbourhood静安区 / Jing'an District    3.073e-01  5.346e-02   5.747 9.28e-09
## neighbourhood黄浦区 / Huangpu District    4.409e-01  5.207e-02   8.467  < 2e-16
##                                             
## (Intercept)                              *  
## minimum_nights                              
## number_of_reviews                           
## reviews_per_month                        ***
## availability_365                         *  
## latitude                                 ** 
## longitude                                   
## calculated_host_listings_count           *  
## room_typePrivate room                    ***
## room_typeShared room                     ***
## neighbourhood奉贤区 / Fengxian District     
## neighbourhood宝山区 / Baoshan District      
## neighbourhood崇明区 / Chongming District ***
## neighbourhood徐汇区 / Xuhui District     ***
## neighbourhood普陀区 / Putuo District     ***
## neighbourhood杨浦区 / Yangpu District    ** 
## neighbourhood松江区 / Songjiang District ** 
## neighbourhood浦东新区 / Pudong           ***
## neighbourhood虹口区 / Hongkou District   ***
## neighbourhood金山区 / Jinshan District   ***
## neighbourhood长宁区 / Changning District ***
## neighbourhood闵行区 / Minhang District   *  
## neighbourhood青浦区 / Qingpu District    ***
## neighbourhood静安区 / Jing'an District   ***
## neighbourhood黄浦区 / Huangpu District   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7267 on 13692 degrees of freedom
##   (11683 observations deleted due to missingness)
## Multiple R-squared:  0.225,  Adjusted R-squared:  0.2237 
## F-statistic: 165.7 on 24 and 13692 DF,  p-value: < 2.2e-16
# Akaike information criterion of model 1 (lower is better among models
# fitted to the same observations).
stats::AIC(model1)
## [1] 30194.74
# Bayesian information criterion of model 1 (lower is better among models
# fitted to the same observations).
stats::BIC(model1)
## [1] 30390.43
# Trim the training sample to listings priced strictly between the 10th and
# 90th percentiles of the training prices (presumably to limit the influence
# of extreme prices on the fit).
# drop_na() is restricted to the columns that enter the regressions: called
# with no arguments it checks every column, so a listing would be discarded
# for a missing value in a field the models never read (e.g. name, host_name,
# last_review).
# NOTE(review): whether those unused columns actually contain NAs is not
# visible here -- confirm against the data.
airbnb_trained_filtered <- airbnb_train %>%
  filter(
    price > quantile(price, 0.1),
    price < quantile(price, 0.9)
  ) %>%
  drop_na(
    price, minimum_nights, number_of_reviews, reviews_per_month,
    availability_365, latitude, longitude,
    calculated_host_listings_count, room_type, neighbourhood
  )

# Model 2: same response as model 1, without minimum_nights and longitude,
# fitted on the trimmed training sample.
model2 <- lm(
  log(price) ~ number_of_reviews + reviews_per_month + availability_365 +
    latitude + calculated_host_listings_count + room_type + neighbourhood,
  data = airbnb_trained_filtered
)
summary(model2)
## 
## Call:
## lm(formula = log(price) ~ number_of_reviews + reviews_per_month + 
##     availability_365 + latitude + calculated_host_listings_count + 
##     room_type + neighbourhood, data = airbnb_trained_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.12889 -0.31352 -0.04667  0.27337  1.45660 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               9.751e-01  2.742e+00   0.356 0.722185
## number_of_reviews                         7.536e-04  1.719e-04   4.383 1.18e-05
## reviews_per_month                        -5.387e-02  4.752e-03 -11.336  < 2e-16
## availability_365                         -5.567e-05  3.036e-05  -1.834 0.066726
## latitude                                  1.552e-01  8.759e-02   1.772 0.076454
## calculated_host_listings_count           -2.518e-04  1.277e-04  -1.972 0.048636
## room_typePrivate room                    -2.573e-01  9.177e-03 -28.032  < 2e-16
## room_typeShared room                     -4.773e-01  5.315e-02  -8.980  < 2e-16
## neighbourhood奉贤区 / Fengxian District  -7.146e-02  6.815e-02  -1.049 0.294354
## neighbourhood宝山区 / Baoshan District   -3.859e-02  4.642e-02  -0.831 0.405743
## neighbourhood崇明区 / Chongming District  4.552e-01  4.838e-02   9.409  < 2e-16
## neighbourhood徐汇区 / Xuhui District      2.464e-01  3.510e-02   7.018 2.37e-12
## neighbourhood普陀区 / Putuo District      1.134e-01  4.340e-02   2.614 0.008965
## neighbourhood杨浦区 / Yangpu District     6.057e-02  4.112e-02   1.473 0.140721
## neighbourhood松江区 / Songjiang District  5.653e-02  4.515e-02   1.252 0.210608
## neighbourhood浦东新区 / Pudong            2.199e-01  3.488e-02   6.305 2.98e-10
## neighbourhood虹口区 / Hongkou District    1.080e-01  3.887e-02   2.779 0.005465
## neighbourhood金山区 / Jinshan District    5.822e-01  8.096e-02   7.191 6.82e-13
## neighbourhood长宁区 / Changning District  1.408e-01  3.827e-02   3.680 0.000234
## neighbourhood闵行区 / Minhang District    1.272e-01  3.985e-02   3.191 0.001424
## neighbourhood青浦区 / Qingpu District     3.806e-01  4.168e-02   9.132  < 2e-16
## neighbourhood静安区 / Jing'an District    1.639e-01  3.499e-02   4.684 2.85e-06
## neighbourhood黄浦区 / Huangpu District    2.829e-01  3.365e-02   8.407  < 2e-16
##                                             
## (Intercept)                                 
## number_of_reviews                        ***
## reviews_per_month                        ***
## availability_365                         .  
## latitude                                 .  
## calculated_host_listings_count           *  
## room_typePrivate room                    ***
## room_typeShared room                     ***
## neighbourhood奉贤区 / Fengxian District     
## neighbourhood宝山区 / Baoshan District      
## neighbourhood崇明区 / Chongming District ***
## neighbourhood徐汇区 / Xuhui District     ***
## neighbourhood普陀区 / Putuo District     ** 
## neighbourhood杨浦区 / Yangpu District       
## neighbourhood松江区 / Songjiang District    
## neighbourhood浦东新区 / Pudong           ***
## neighbourhood虹口区 / Hongkou District   ** 
## neighbourhood金山区 / Jinshan District   ***
## neighbourhood长宁区 / Changning District ***
## neighbourhood闵行区 / Minhang District   ** 
## neighbourhood青浦区 / Qingpu District    ***
## neighbourhood静安区 / Jing'an District   ***
## neighbourhood黄浦区 / Huangpu District   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4318 on 11305 degrees of freedom
## Multiple R-squared:  0.1204, Adjusted R-squared:  0.1187 
## F-statistic: 70.33 on 22 and 11305 DF,  p-value: < 2.2e-16
# Akaike information criterion of model 2. model2 is fitted on
# airbnb_trained_filtered rather than airbnb_train, so this value is not
# directly comparable with AIC(model1).
stats::AIC(model2)
## [1] 13146.25
# Bayesian information criterion of model 2. model2 is fitted on
# airbnb_trained_filtered rather than airbnb_train, so this value is not
# directly comparable with BIC(model1).
stats::BIC(model2)
## [1] 13322.29
References

[1] DT package documentation: https://rstudio.github.io/DT/

[2] hrbrthemes package reference manual: https://cran.r-project.org/web/packages/hrbrthemes/hrbrthemes.pdf

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.